import time
import re
import json
import urllib
import scrapy
from apps.tax_policy.tax_policy.items import NetTaxPolicyItem as Item
from apps.tax_policy.tax_policy.spiders.base_spider.base_tax_policy_spider import BaseTaxPolicySpider
import math
import base64
from urllib.parse import urlencode


class spider(BaseTaxPolicySpider):
    """Spider for the document list published at czj.chaoyang.gov.cn.

    Flow: ``start_requests`` fetches the first list page,
    ``detail_requests`` extracts every entry of a list page (and, for the
    first page only, schedules the remaining list pages), and
    ``parse_detail`` turns each document page into one item.
    """

    name = "chaoyang_lnscysczj"

    # Location / source metadata copied onto every yielded item.
    province: str = "辽宁省"  # taken from the table
    city: str = "朝阳市"  # taken from the table
    county: str = ""  # taken from the table
    # NOTE(review): this is the literal string "None", not the None object —
    # presumably a project convention; confirm before changing.
    park: str = "None"  # taken from the table
    source: str = "辽宁省朝阳市财政局"  # taken from the table; entries with the same source are merged
    # Entry URL, recorded to make later troubleshooting easier. Note that
    # start_requests() uses its own https:// copy of this address.
    url: str = "http://czj.chaoyang.gov.cn/cysczj/zwgk/zdgkwj/glist.html"
    # Browser-like headers sent with every request.
    # NOTE(review): the Referer points at a different subdomain (wlgdj) —
    # presumably copied from another spider; confirm it is intended.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://wlgdj.chaoyang.gov.cn/cyswhlyhgbdsj/zwgk/bmwj/glist9.html',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    refer_url = ''

    def start_requests(self):
        """Yield the request for the first list page.

        The response is handled by ``detail_requests``.
        """
        url = "https://czj.chaoyang.gov.cn/cysczj/zwgk/zdgkwj/glist.html"

        yield scrapy.Request(url=url, method='GET', headers=self.headers, callback=self.detail_requests)

    def detail_requests(self, response, **kwargs):
        """Parse one list page and schedule its documents and sibling pages.

        Args:
            response: The list-page response. ``response.meta`` may carry
                ``page`` (page number, used only for diagnostics) and
                ``is_next`` (``False`` on pages that must not paginate).
            **kwargs: Accepted for signature compatibility; unused.

        Yields:
            One ``scrapy.Request`` per list entry (handled by
            ``parse_detail``) and, from the first page only, one request
            per additional list page (handled by this method again).

        Raises:
            ValueError: If the numbers of links, titles and dates extracted
                from the page differ, so entries cannot be paired reliably.
        """
        page = response.meta.get('page', "")
        links = response.xpath("//ul[@class='list1']//a/@href").getall()  # entry links
        title_list = response.xpath("//ul[@class='list1']//a/b/text()").getall()  # entry titles
        pub_date = response.xpath("//ul[@class='list1']//a/span/text()").getall()  # entry publication dates
        print("links", links, page)
        print("pub_date", pub_date, page)
        print("title_list", title_list, page)

        if not (len(title_list) == len(links) == len(pub_date)):
            # Raise a real exception: raising a tuple only produces an
            # unrelated TypeError and hides which page was malformed.
            raise ValueError(
                f"出错了: page={page!r}, links={len(links)}, "
                f"titles={len(title_list)}, dates={len(pub_date)}"
            )

        for link, title, date in zip(links, title_list, pub_date):
            # Resolve the href against the page URL the way a browser does.
            # Absolute URLs pass through unchanged; relative and
            # protocol-relative hrefs get the correct scheme and host.
            link = response.urljoin(link)
            print("links[i]", link, 'page', page)
            yield scrapy.Request(link, headers=self.headers, callback=self.parse_detail,
                                 meta={'pubdate': date,
                                       'title': title})

        # Pagination: only the first page (which has no is_next=False marker)
        # fans out to the remaining list pages.
        if response.meta.get("is_next") is not False:
            match = re.search(r"""(\d+) 页""", response.text)
            if match is None:
                # No page-count marker found: treat the list as single-page
                # instead of crashing on match.group().
                print("未匹配到总页数", response.url)
                return
            pages = int(match.group(1))
            if pages > 1:
                print("总页数：", pages)
                # Page N (N >= 2) lives at "<stem><N-1>.html".
                stem = response.url.replace(".html", "")
                for page in range(2, pages + 1):
                    print("当前页：", page)
                    url = stem + str(page - 1) + ".html"
                    yield scrapy.Request(url=url, headers=self.headers, callback=self.detail_requests,
                                         meta={"is_next": False, 'page': page})

    def parse_list(self, response, **kwargs):
        """Intentionally empty; list pages are handled by ``detail_requests``."""
        pass

    def parse_detail(self, response, **kwargs):
        """Build one item from a document response.

        Args:
            response: The document response. ``response.meta`` carries the
                ``title`` and ``pubdate`` taken from the list page.
            **kwargs: Accepted for signature compatibility; unused.

        Yields:
            A single item. For URLs that look like file downloads the
            content is the URL itself and the date is the list-page date;
            otherwise the content is the serialized page and the date comes
            from the ``PubDate`` meta tag, falling back to the list-page
            date when that tag is missing or empty.
        """
        item = Item()
        listed_date = response.meta.get('pubdate')

        # Substring match (not suffix) is kept on purpose so that e.g.
        # ".docx" / ".xlsx" URLs are caught by ".doc" / ".xls".
        file_markers = ('.pdf', '.doc', '.xls', '.jpg')
        if any(marker in response.url for marker in file_markers):
            print("特殊url：", response.url)
            content = response.url
            pub_date = listed_date
        else:
            content = response.xpath(".").get()
            # string(...) yields '' when the meta tag is absent; in that case
            # reuse the list-page date rather than storing an empty date.
            pub_date = response.xpath("""string(//meta[@name='PubDate']/@content)""").get() or listed_date

        item['title'] = response.meta.get('title')
        item['source_url'] = response.url
        item['publish_date'] = pub_date
        item['content'] = content
        item['source'] = self.source
        item['province'] = self.province
        item['city'] = self.city
        item['county'] = self.county
        item['park'] = self.park
        yield item


if __name__ == "__main__":
    # Convenience entry point: running this file directly is the same as
    # invoking `scrapy crawl chaoyang_lnscysczj` on the command line.
    from scrapy.cmdline import execute

    execute(["scrapy", "crawl", "chaoyang_lnscysczj"])
